CATMULL_SHIFT  equ 8
ROUND equ 8
;include "labs.inc"  ; dbg / profiling purpose
horizontal_tex_grd_line:
; Draws one horizontal span with a per-pixel depth test.
; Depending on the byte at [draw_flag] every visible pixel is written as
;   - Gouraud colour only                  (flag = 0 or 2),
;   - bilinear filtered texture only       (flag = 3 or 5),
;   - texture modulated by Gouraud colour  (any other value).
;
; input:  - glass tex horizontal line calling convention
;
;    xmm0 - colour 1 (normal vector 1) - floats, highest dword ignored
;    xmm1 - colour 2 (normal vector 2) - floats, highest dword ignored
;    xmm3 - lo -> hi tx1, ty1, z1 coords as dword floats
;    xmm5 - lo -> hi tx2, ty2, z2 coords as dword floats
;    xmm2 - lo -> hi y_min, y_max, x_min, x_max as dword integers
;           (y_min / y_max are stored in the frame but not read here)
;    eax  - x1
;    ebx  - x2
;    ecx  - y
;    edi  - screen buffer (4 bytes per pixel)
;    esi  - depth buffer filled with dd floats (4 bytes per pixel)
;    edx  - texture pointer (handle)
;    xmm6 - lo -> hi x_res, tex_shift, tex_x * 4, tex size
;           as dword integers
;
; output: none
; clobbers: eax, ebx, ecx, edx, esi, edi, xmm0-xmm7, flags (ebp preserved)
; external symbols used: draw_flag, correct_texf, bi_fil2

    pcmpeqd  xmm7,xmm7
    movaps   xmm4,xmm3
    psrldq   xmm7,4      ; xmm7 = mask clearing the highest dword
    movlhps  xmm4,xmm5   ; xmm4 = tx1, ty1, tx2, ty2

    andps    xmm0,xmm7
    andps    xmm1,xmm7
    andps    xmm3,xmm7
    andps    xmm5,xmm7
    shufps   xmm3,xmm3,10111111b  ; xmm3 = 0, 0, 0, z1
    shufps   xmm5,xmm5,10111111b  ; xmm5 = 0, 0, 0, z2
    psrld    xmm7,31
    orps     xmm0,xmm3            ; xmm0 = colour 1 (3 floats), z1
    cvtdq2ps xmm7,xmm7            ; xmm7 = 1.0, 1.0, 1.0, 0.0   [the_one]
    orps     xmm1,xmm5            ; xmm1 = colour 2 (3 floats), z2


    ; ---- frame layout (ebp is 16 byte aligned, see prologue) ----
    .x1      equ  word[ebp-2]
    .x2      equ  word[ebp-4]
    .dz      equ dword[ebp-8]
    .db      equ      [ebp-12]
    .dg      equ dword[ebp-16]
    .dr      equ      [ebp-20]    ; 16 bytes: per pixel step of colour + z
    .dtex_x  equ dword[ebp-24]
    .dtex_y  equ      [ebp-28]    ; 8 bytes: per pixel step of tex coords
    .dr_cmp  equ      [ebp-32]    ; the draw_flag values to compare against
    .dr_flag2     equ [ebp-36]
    .dr_flag1     equ [ebp-38]    ; draw_flag replicated into 4 bytes
;    .z2      equ      [ebp-32]
;    .b2      equ      [ebp-36]
;    .g2      equ      [ebp-40]
;    .r2      equ      [ebp-44]
    .z1      equ      [ebp-48]
    .b1      equ      [ebp-52]
    .g1      equ      [ebp-56]
    .r1      equ      [ebp-60]    ; 16 bytes: start colour + z

    .tex_x2  equ      [ebp-64]
    .tex_y2  equ      [ebp-68]
    .tex_x1  equ      [ebp-72]
    .tex_y1  equ      [ebp-76]    ; 16 bytes: start / end tex coords

    .y_min   equ      [ebp-84]
    ; .bi_fil  equ [ebp-84]
    .y_max   equ dword[ebp-80]
    .y       equ dword[ebp-88]
    .tex_ptr equ dword[ebp-92]
    .tex_size     equ dword[ebp-96]     ; superseded by the definition below
    .correct_texf equ dword[ebp-104]
    .x_res        equ [ebp-124]         ; 16 bytes: copy of xmm6, handed
    .tex_shift    equ [ebp-120]         ; to bi_fil2 as its params pack
    .tex_x4       equ [ebp-116]
    .tex_size     equ [ebp-112]
    .the_one      equ [ebp]

        push      ebp
        mov       ebp,esp
        and       ebp,-16               ; aligned base, needed by the
        sub       ebp,16                ; pcmpeqb with memory operand
        ; Locals reach down to ebp-124 and ebp is at most 31 bytes below
        ; the current esp, so 155 bytes are needed.  176 keeps esp dword
        ; aligned (assuming it was on entry) for the push / call in the loop.
        sub       esp,176
        mov       .y,ecx
        movlps    .the_one,xmm7
        mov       .tex_ptr,edx
        movlps    .y_min,xmm2
        movups    .x_res,xmm6
        mov       .x1,ax
        mov       .x2,bx
        movups    .r1,xmm0
        movups    .tex_y1,xmm4
        ; ---- per pixel deltas: (end - start) * 1 / (x2 - x1) ----
        sub       ebx,eax
        subps     xmm1,xmm0
        cvtsi2ss  xmm3,ebx
        movhlps   xmm5,xmm4
        shufps    xmm3,xmm3,0
        subps     xmm5,xmm4
        rcpps     xmm3,xmm3             ; approximate reciprocal
        mulps     xmm5,xmm3
        mulps     xmm1,xmm3
        movups    .dr,xmm1
        movlps    .dtex_y,xmm5
        ; ---- clip against the left screen edge ----
        xor       edx,edx
        cmp       .x1,dx
        jge       @f
        movsx     eax,.x1
        neg       eax                   ; number of skipped pixels
        cvtsi2ss  xmm2,eax
        shufps    xmm2,xmm2,0
        mulps     xmm1,xmm2             ; * delta
        addps     xmm0,xmm1             ; + start value
        movups    .r1,xmm0
        mulps     xmm5,xmm2
        addps     xmm4,xmm5
        movlps    .tex_y1,xmm4
        and       .x1,dx                ; zero x1
      @@:
        ; ---- clip against the right screen edge ----
        mov       ax,.x2
        mov       edx,.x_res
        cmp       ax,dx
        cmovg     ax,dx
        ; calc line address begin in screen and Z buffer
        mov       .x2,ax
        mov       eax,.y
        mul       edx
        movsx     edx,.x1
        add       eax,edx
        shl       eax,2
        add       esi,eax
        add       edi,eax
        mov       cx,.x2
        sub       cx,.x1
        ; Nothing to draw when the clipped span is empty or reversed.
        ; Without this check the dec / jnz loop below would wrap around
        ; and write far outside both buffers.
        jle       .quit_l
        movzx     ecx,cx                ; ecx = pixel count, always > 0 here
        ; init current variables
        movups    xmm0,.r1
        ; movups    xmm1,.dr
        movzx     eax,[draw_flag]
        mov       .dr_cmp,dword 0x03050200   ; possible values
        mov       ah,al                      ; to compare
        mov       .dr_flag1,ax
        mov       .dr_flag2,ax
        xorps     xmm2,xmm2
        movlps    xmm4,.dtex_y          ; xmm4 = tex coord step
        movlps    xmm5,.tex_y1          ; xmm5 = current tex coords
        ; mov       .tex_size,TEXTURE_SIZE
        mov       .correct_texf,correct_texf
        ; mov       .t_one,dword the_one
        ; mov       .bi_fil,dword bi_fil2
      .ddraw:
        ; loop state: xmm0 = current colour + z, xmm5 = current tex coords,
        ;             esi / edi = current depth / screen pixel
        push      ecx
        movaps    xmm6,xmm0
        shufps    xmm6,xmm6,11111111b   ; broadcast current z
        comiss    xmm6,dword[esi]
        ja        .add_nx               ; z above stored depth - pixel hidden
        movss     [esi],xmm6
        movaps    xmm2,xmm5
        cvttps2dq xmm6,xmm5             ; integer part of tex coords
        movlps    xmm3,.dr_flag1
        pcmpeqb   xmm3,.dr_cmp          ; mask bit n set: flag = n-th value
        pmovmskb  eax,xmm3
        test      al,11b                ; flag = 0 or 2
        jnz       .only_grd
        cvtdq2ps  xmm7,xmm6
        subps     xmm2,xmm7
        movlps    xmm3,.the_one
        ; xm2 - l-h -> xf, yf
        subps     xmm3,xmm2
        mov       ebx,.correct_texf
        ; xm3 - l-h ->  1-xf 1-yf
        movlhps   xmm3,xmm2  ; xm3: l-h -> 1-xf, 1-yf, xf, yf
        movlhps   xmm2,xmm3  ; xm2: l-h -> xf, yf, 1-xf, 1-yf
        shufps    xmm3,xmm3,10001000b ; l-h: 1-xf xf    1-xf xf
        shufps    xmm2,xmm2,01011111b ; l-h: 1-yf 1-yf  yf   yf
        mulps     xmm2,xmm3  ; xm2: l-h -> w1 - w4 as floats
        mulps     xmm2,[ebx] ; scale weights by the correct_texf constants
     if 0
        ; disabled trilinear filtering experiment
        mov       eax,0.5
        xorps     xmm2,xmm2
        movd      xmm2,eax
        movlhps   xmm2,xmm3
        shufps    xmm2,xmm2,00001011b
        sub       esp,8
        movlps    [esp],xmm6
        pop       edx eax
        shl       eax,TEX_3D_SHIFT
        add       eax,edx
        and       eax, TEXTURE_3D_SIZE
        shl       eax,2
        mov       edx,eax
        add       eax,tex3d
        add       edx,tex3d
        call      tri_fil
   end if
        lea       ebx,.x_res            ; ebx -> tex params pack
        mov       eax,.tex_ptr
        call      bi_fil2               ; xmm7 = filtered texel as words
        movlps    xmm3,.dr_flag1
        pcmpeqb   xmm3,.dr_cmp
        pmovmskb  eax,xmm3
        test      al,1100b              ; flag = 5 or 3
        jnz       .only_tex
        cvtps2dq  xmm3,xmm0   ; calc col
        packssdw  xmm3,xmm3
        ; pmaddwd    xmm7,xmm3
        pmullw    xmm7,xmm3             ; texel * colour
        psrlw     xmm7,8
      .only_tex:
        packuswb  xmm7,xmm7
        movd      [edi],xmm7
        jmp       .add_nx
      .only_grd:
        cvtps2dq  xmm3,xmm0
        packssdw  xmm3,xmm3
        packuswb  xmm3,xmm3
        movss     [edi],xmm3
      .add_nx:
        pop       ecx
        add       edi,4
        add       esi,4
        movups    xmm1,.dr              ; xmm1 is clobbered by bi_fil2
        addps     xmm5,xmm4
        addps     xmm0,xmm1
        ; loop       .ddraw
        dec       ecx
        jnz       .ddraw
  .quit_l:
        add       esp,176               ; must match the prologue
       ; mov        esp,ebp
        pop       ebp
ret     ; horizontal line
;=====================================================================================
bi_fil2:
       ; bilinear filtering proc
       ; Blends the 2 x 2 block of 4 byte texels that starts at the given
       ; integer texel coordinates, using four 16 bit weights.
       ; in:
       ;        ebx  - tex params pack: 4 dwords
       ;               x_res, tex_shift, tex_x * 4, tex_size
       ;               (tex_size is used as an AND mask on the texel index)
       ;        xmm6 - integer texel coords as dwords, lo -> hi: x, y
       ;        xmm2 - weights w1 - w4 as floats; the sums are shifted right
       ;               by 8, so they are presumably scaled to sum to 256
       ;               - TODO confirm against correct_texf
       ;        eax  - tex ptr
       ; out:
       ;        xmm7 - filtered texel, one word per channel in texel byte
       ;               order, repeated in the low and the high half
       ; clobbers: eax, edx, ebx (= tex ptr on return), xmm1, xmm2, xmm3,
       ;           flags; ecx is preserved
       .x_res     equ [ebx]
       .tex_shift equ [ebx+4]
       .tex_x4    equ [ebx+8]
       .tex_size  equ [ebx+12]
       push       eax                   ; keep tex ptr for later
       sub        esp,8
       movlps     [esp],xmm6
       pop        edx eax               ; edx = x, eax = y
       push       ecx
       mov        ecx,.tex_shift
       shl        eax,cl
        ; calc texture pixel mem address
       add        eax,edx               ; eax = (y shl tex_shift) + x
       and        eax,.tex_size  ;TEXTURE_SIZE
       ; cutting
       mov        ecx,.tex_x4
       shr        ecx,2                 ; texels per texture row
       mov        edx,eax
       add        edx,ecx        ;TEX_X ;512
       and        edx,.tex_size  ;TEXTURE_SIZE
       shl        edx,2                 ; byte offset of the lower texel pair
       shl        eax,2                 ; byte offset of the upper texel pair
       pop        ecx
       pop        ebx                   ; ebx = tex ptr pushed on entry
       add        eax,ebx ;.tex_ptr
       add        edx,ebx ;.tex_ptr

       movlps     xmm7,[eax]            ; xm7 r1 g1 b1 00 r2 g2 b2 00 as bytes
       movlps     xmm3,[edx]            ; xm3 r3 g3 b3 00 r4 g4 b4 00 as bytes
       punpcklbw  xmm7,xmm3
       ; xm7: r1, r3, g1, g3, b1, b3, 00, 00 :: r2, r4, g2, g4, b2, b4, 00, 00  as bytes
       movhlps    xmm3,xmm7
       punpcklbw  xmm7,xmm3
       ; xm7: r1, r2, r3, r4, g1, g2, g3, g4, b1, b2, b3, b4 00, 00, 00, 00 as bytes
       xorps      xmm1,xmm1
       movhlps    xmm3,xmm7
       punpcklbw  xmm7,xmm1             ; xm7: r1 - r4, g1 - g4 as words
       punpcklbw  xmm3,xmm1             ; xm3: b1 - b4, a1 - a4 as words

       cvtps2dq   xmm2,xmm2
       packssdw   xmm2,xmm2             ; xm2: w1 - w4, w1 - w4 as words

       pmaddwd    xmm7,xmm2     ; xm7: r1*w1 + r2*w2, r3*w3 + r4*w4, g1*w1 + g2*w2, g3*w3 + g4*w4
       pmaddwd    xmm3,xmm2     ; xm3: b1*w1 + b2*w2, b3*w3 + b4*w4, the same for the 4th byte
       ; Add the two partial sums of every channel.  The previous
       ; punpckldq xmm7,xmm7 based sequence dropped both green sums and
       ; returned the red value in the green lane.
       movaps     xmm1,xmm7
       shufps     xmm7,xmm3,10001000b   ; xm7: r12, g12, b12, a12
       shufps     xmm1,xmm3,11011101b   ; xm1: r34, g34, b34, a34
       paddd      xmm7,xmm1             ; xm7: r, g, b, a sums as dwords
       psrld      xmm7,8
       packssdw   xmm7,xmm7
ret
if 0
; trilinear interpolation attempt (disabled, kept for reference)
tri_fil:
   ; in:
   ;   eax / + TEX_X_3D = col 000, col 001 / col 010, col 011
                        ; increased / decreased x and y value
   ;   edx / + TEX_X_3D = col 100, col 101 / col 110, col 111
                        ; increased / decreased x and y value and increased z value
   ;   ebx - ptr to f256 (16 byte aligned, read with movaps)
   ;   xmm2 - fractional coefficients xD, yD, zD as floats
   ; out:
   ;   xmm7 - color as words
   ; xmm0, xmm4, xmm5, xmm6 are saved and restored; xmm1 - xmm3 are clobbered
   ; NOTE(review): uses packusdw, which is an SSE4.1 instruction.
   ; NOTE(review): the lane comments below were not verified against the
   ;               c000..c111 naming - confirm before enabling this code.
      push     ebp
      mov      ebp,esp
      sub      esp,112

      .x0      equ [ebp-16]    ; I assume textures   ~64x64x64 or  ~32X32X32
      .xyzD    equ [ebp-32]
      .OmxyzD  equ [ebp-48]
      .x4      equ [ebp-64]
      .x5      equ [ebp-80]
      .x6      equ [ebp-96]
      .x7      equ [ebp-112]


      movups    .x4,xmm4
      movups    .x5,xmm5
      movups    .x6,xmm6
      movups    .x0,xmm0

      movaps    xmm3,[ebx]
      mulps     xmm2,xmm3
      subps     xmm3,xmm2
      cvtps2dq  xmm3,xmm3
      cvtps2dq  xmm2,xmm2
      packssdw  xmm3,xmm3
      packssdw  xmm2,xmm2
      movups    .OmxyzD,xmm3 ; f256 - scaled coefficients, as words
      movups    .xyzD,xmm2   ; scaled coefficients, as words
      pshuflw   xmm3,xmm3,0  ; xm3  l part = brdcs 1-xD
      pshuflw   xmm2,xmm2,0  ; xm2  l part = brdcs xD
      ; xm2 = 4x xD as words
      ; xm3 - 4x 1-xD as words
      punpcklwd xmm3,xmm2    ; xm3 = interleaved 1-xD, xd....as words
      xorps     xmm6,xmm6

      movlps    xmm4,[eax]
      movhps    xmm4,[edx]   ; xm4 = lo -> hi [eax], [eax+4], [edx], [edx+4]
                             ; each col 4 bytes
      shufps    xmm4,xmm4,11011000b  ; swap the two middle dwords
  ;    punpcklbw xmm4,xmm5
  ;    punpckldq xmm4,xmm4   ; xm4 = lo -> hi c000, c001, c100, c101  as rgbX bytes
      movhlps   xmm5,xmm4
      punpcklbw xmm4,xmm5    ; xm4 =  r000, r100, b000, b100 ... r001, r101, b001, b101 as bytes
      movhlps   xmm5,xmm4
      punpcklbw xmm4,xmm6    ; xm4 =  r000, r100, b000, b100.... as words
      punpcklbw xmm5,xmm6
      pmaddwd   xmm4,xmm3
      pmaddwd   xmm5,xmm3
      psrld     xmm4,8       ; xm4 =  c00 rgb as dwords integer
      psrld     xmm5,8       ; xm5 =  c01 rgb as dwords integer
      packusdw  xmm4,xmm4
      packusdw  xmm5,xmm5

      movlps    xmm7,[eax+TEX_X_3D]     ; TEX_X_3D
      movhps    xmm7,[edx+TEX_X_3D]
      shufps    xmm7,xmm7,11011000b
                             ; xm7 = lo -> hi c010, c011, c110, c111  as rgbXX bytes
                             ; each col 4 bytes
   ;   punpckldq xmm7,xmm7   ; xm7 = lo -> hi c010, c110, c011, c111  as rgbX bytes
      movhlps   xmm2,xmm7
      punpcklbw xmm7,xmm2    ; xm7 =  r010, r110, b010, b110 ... r011, r111, b011, b111.. as bytes
      movhlps   xmm2,xmm7
      punpcklbw xmm7,xmm6    ; xm7 =  r010, r110, b010, b110.... as words
      punpcklbw xmm2,xmm6
      pmaddwd   xmm7,xmm3
      pmaddwd   xmm2,xmm3
      psrld     xmm7,8       ; xm7 =  c10 rgb as dwords integer
      psrld     xmm2,8       ; xm2 =  c11 rgb as dwords integer
      packusdw  xmm2,xmm2    ; xm2 =  c11 as words
      packusdw  xmm7,xmm7    ; xm7 =  c10 as words

      movups    xmm0,.xyzD
      movups    xmm1,.OmxyzD
      pshuflw   xmm0,xmm0,01010101b    ; xm0  l part = brdcs yD
      pshuflw   xmm1,xmm1,01010101b    ; xm1  l part = brdcs 1-yD
      punpcklwd xmm1,xmm0    ; xm1 =  interleaved 1-yD, yd....as words

      punpcklwd xmm4,xmm7    ; xm4 =  r00, r10, b00, b10.... as words
      pmaddwd   xmm4,xmm1    ; xm4 =  c0
      punpcklwd xmm5,xmm2
      pmaddwd   xmm5,xmm1    ; xm5 =  c1
  if 1
      movups    xmm0,.xyzD
      movups    xmm1,.OmxyzD
      pshuflw   xmm0,xmm0,10101010b  ; xm0  l part = brdcs zD
      pshuflw   xmm1,xmm1,10101010b  ; xm1  l part = brdcs 1-zD
      punpcklwd xmm1,xmm0    ; xm1 =  interleaved 1-zD, zd....as words
      psrld     xmm4,8
      psrld     xmm5,8
      packusdw  xmm5,xmm5
      packusdw  xmm4,xmm4
      punpcklwd xmm5,xmm4
      pmaddwd   xmm5,xmm1    ; xm5 =  color
  end if
      movaps    xmm7,xmm5

      psrld     xmm7,8
      packusdw  xmm7,xmm7

      movups    xmm4,.x4
      movups    xmm5,.x5
      movups    xmm6,.x6
      movups    xmm0,.x0
      ; c00 = c000*(1-xD) + c100*xD
      ; c01 = c001*(1-xD) + c101*xD
      ; c10 = c010*(1-xD) + c110*xD
      ; c11 = c011*(1-xD) + c111*xD
      ; c0  = c00*(1-yD)  + c10*yD
      ; c1  = c01*(1-yD)  + c11*yD
      ; c   = c0*(1-zD)   + c1*zD
      ; Release the frame through ebp.  The previous "add esp,127" did not
      ; match the 112 bytes reserved in the prologue, so pop ebp / ret
      ; would have read the wrong stack slots.
      mov       esp,ebp
      pop       ebp
ret
end if
